/*
 * AES-NI + SSE2 implementation of AEGIS-128
 *
 * Copyright (c) 2017-2018 Ondrej Mosnacek <omosnacek@gmail.com>
 * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * Vector register allocation.
 *
 * STATE0-STATE4 hold the five 128-bit state words. KEY and MSG are two
 * names for the same register (%xmm5): KEY is only referenced by the init
 * routine, MSG by all other routines. T0 and T1 are scratch registers.
 */
#define STATE0	%xmm0
#define STATE1	%xmm1
#define STATE2	%xmm2
#define STATE3	%xmm3
#define STATE4	%xmm4
#define KEY	%xmm5
#define MSG	%xmm5
#define T0	%xmm6
#define T1	%xmm7

/*
 * Integer argument registers of the ad/enc/dec entry points and of the
 * partial load/store helpers. The init and final routines receive other
 * arguments in %rsi/%rdx/%rcx and name those registers directly.
 */
#define STATEP	%rdi
#define LEN	%rsi
#define SRC	%rdx
#define DST	%rcx

/*
 * Initialization constants: two 16-byte blocks used by the init routine to
 * seed the state (loaded into STATE2/STATE1 and XORed with the key into
 * STATE3/STATE4). The 32 bytes are the Fibonacci sequence reduced modulo 256
 * (0, 1, 1, 2, 3, 5, 8, 13, ...).
 */
.section .rodata.cst16.aegis128_const, "aM", @progbits, 32
.align 16
.Laegis128_const_0:
	.byte 0x00, 0x01, 0x01, 0x02, 0x03, 0x05, 0x08, 0x0d
	.byte 0x15, 0x22, 0x37, 0x59, 0x90, 0xe9, 0x79, 0x62
.Laegis128_const_1:
	.byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1
	.byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd

/*
 * Byte indices 0..15. The dec_tail routine compares a broadcast of the tail
 * length against these to build a per-byte mask covering the first LEN bytes.
 */
.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16
.align 16
.Laegis128_counter:
	.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f

.text

/*
 * aegis128_update
 *
 * One state update without the message XOR (callers add the message
 * themselves right after the macro). With AESRound() denoting the single AES
 * round performed by aesenc, the macro computes, from the old values:
 *
 *   STATE4 = AESRound(STATE4) ^ STATE0
 *   STATE0 = AESRound(STATE0) ^ STATE1
 *   STATE1 = AESRound(STATE1) ^ STATE2
 *   STATE2 = AESRound(STATE2) ^ STATE3
 *   STATE3 = AESRound(STATE3) ^ STATE4 (old value, saved in T0)
 *
 * The registers are updated in place, so the logical state is rotated by one
 * register afterwards: the new first state word is in STATE4, the second in
 * STATE0, and so on. Callers track this rotation (see the order in which
 * they XOR the message and store the state).
 *
 * input:
 *   STATE[0-4] - input state
 * output:
 *   STATE[0-4] - output state (shifted positions)
 * changed:
 *   T0
 */
.macro aegis128_update
	movdqa STATE4, T0	/* keep old STATE4: it is the last round key */
	aesenc STATE0, STATE4
	aesenc STATE1, STATE0
	aesenc STATE2, STATE1
	aesenc STATE3, STATE2
	aesenc T0,     STATE3
.endm

/*
 * __load_partial: internal ABI
 *
 * Assemble a zero-padded message block from a partial buffer. The length is
 * decomposed by its low bits into a 1-byte piece (bit 0), a 2-byte piece
 * (bit 1), a 4-byte piece (bit 2) and an 8-byte piece (bit 3). Each piece is
 * read from the offset formed by the higher length bits, so the pieces are
 * adjacent in memory and are merged in memory order.
 *
 * NOTE(review): only bits 0-4 of LEN are examined; callers presumably pass
 * LEN < 16 - confirm against the C glue code.
 *
 * input:
 *   LEN - bytes
 *   SRC - src
 * output:
 *   MSG  - message block
 * changed:
 *   T0
 *   %r8
 *   %r9
 */
__load_partial:
	xor %r9d, %r9d		/* %r9 collects the pieces smaller than 8 bytes */
	pxor MSG, MSG

	/* bit 0: single byte at offset (LEN & 0x1E) */
	mov LEN, %r8
	and $0x1, %r8
	jz .Lld_partial_1

	mov LEN, %r8
	and $0x1E, %r8
	add SRC, %r8
	mov (%r8), %r9b

.Lld_partial_1:
	/* bit 1: 2 bytes at offset (LEN & 0x1C), placed below the byte above */
	mov LEN, %r8
	and $0x2, %r8
	jz .Lld_partial_2

	mov LEN, %r8
	and $0x1C, %r8
	add SRC, %r8
	shl $0x10, %r9		/* make room in the low 16 bits */
	mov (%r8), %r9w

.Lld_partial_2:
	/* bit 2: 4 bytes at offset (LEN & 0x18), placed below what we have */
	mov LEN, %r8
	and $0x4, %r8
	jz .Lld_partial_4

	mov LEN, %r8
	and $0x18, %r8
	add SRC, %r8
	shl $32, %r9		/* low 32 bits of %r9 are now zero */
	mov (%r8), %r8d		/* 32-bit load zero-extends into %r8 */
	xor %r8, %r9

.Lld_partial_4:
	/* move the collected (up to 7) bytes into the low half of MSG */
	movq %r9, MSG

	/* bit 3: 8 bytes at offset (LEN & 0x10), placed below what we have */
	mov LEN, %r8
	and $0x8, %r8
	jz .Lld_partial_8

	mov LEN, %r8
	and $0x10, %r8
	add SRC, %r8
	pslldq $8, MSG		/* shift previous pieces into the high half */
	movq (%r8), T0		/* low half = 8 loaded bytes, high half = 0 */
	pxor T0, MSG

.Lld_partial_8:
	ret
ENDPROC(__load_partial)

/*
 * __store_partial: internal ABI
 *
 * Store the first LEN bytes of T0 to DST without writing past DST + LEN.
 * The block is written in pieces of 8, 4, 2 and 1 bytes, each piece being
 * emitted only if at least that many bytes remain.
 *
 * The callers receive the length as an 'unsigned int', so only the low 32
 * bits of LEN are meaningful; the working copy is zero-extended and compared
 * as an unsigned value.
 *
 * input:
 *   LEN - bytes (low 32 bits)
 *   DST - dst
 *   T0  - message block
 * changed:
 *   T0   (shifted right by 8 bytes if LEN >= 8)
 *   %r8
 *   %r9
 *   %r10
 */
__store_partial:
	mov LEN, %r8
	mov %r8d, %r8d		/* zero-extend: drop undefined upper 32 bits */
	mov DST, %r9

	movq T0, %r10		/* %r10 = low 8 bytes of the block */

	cmp $8, %r8
	jb .Lst_partial_8

	mov %r10, (%r9)
	psrldq $8, T0		/* bring the high 8 bytes down */
	movq T0, %r10

	sub $8, %r8
	add $8, %r9

.Lst_partial_8:
	cmp $4, %r8
	jb .Lst_partial_4

	mov %r10d, (%r9)
	shr $32, %r10

	sub $4, %r8
	add $4, %r9

.Lst_partial_4:
	cmp $2, %r8
	jb .Lst_partial_2

	mov %r10w, (%r9)
	shr $0x10, %r10

	sub $2, %r8
	add $2, %r9

.Lst_partial_2:
	cmp $1, %r8
	jb .Lst_partial_1

	mov %r10b, (%r9)

.Lst_partial_1:
	ret
ENDPROC(__store_partial)

/*
 * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv);
 *
 * Build the initial state from the key and IV and write it to *state.
 *
 *   %rdi (STATEP) - state, 5 x 16 bytes, written with unaligned stores
 *   %rsi          - key, 16 bytes, read with an aligned load (movdqa), so it
 *                   must be 16-byte aligned (assumed to be guaranteed by the
 *                   caller - TODO confirm)
 *   %rdx          - iv, 16 bytes, no alignment requirement
 *
 * Starting values:
 *   STATE0 = key ^ iv       STATE1 = const_1         STATE2 = const_0
 *   STATE3 = key ^ const_0  STATE4 = key ^ const_1
 * followed by ten state updates that alternately absorb key and key ^ iv.
 */
ENTRY(crypto_aegis128_aesni_init)
	FRAME_BEGIN

	/* load IV: */
	movdqu (%rdx), T1

	/* load key: */
	movdqa (%rsi), KEY
	pxor KEY, T1		/* T1 = key ^ iv from here on */
	movdqa T1, STATE0
	movdqa KEY, STATE3
	movdqa KEY, STATE4

	/* load the constants (RIP-relative, position-independent): */
	movdqa .Laegis128_const_0(%rip), STATE2
	movdqa .Laegis128_const_1(%rip), STATE1
	pxor STATE2, STATE3
	pxor STATE1, STATE4

	/*
	 * update 10 times with KEY / KEY xor IV; the destination register
	 * steps backwards because every update rotates the state by one:
	 */
	aegis128_update; pxor KEY, STATE4
	aegis128_update; pxor T1,  STATE3
	aegis128_update; pxor KEY, STATE2
	aegis128_update; pxor T1,  STATE1
	aegis128_update; pxor KEY, STATE0
	aegis128_update; pxor T1,  STATE4
	aegis128_update; pxor KEY, STATE3
	aegis128_update; pxor T1,  STATE2
	aegis128_update; pxor KEY, STATE1
	aegis128_update; pxor T1,  STATE0

	/* store the state (ten updates = two full rotations, natural order): */
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_init)

/*
 * void crypto_aegis128_aesni_ad(void *state, unsigned int length,
 *                               const void *data);
 *
 * Absorb associated data into the state, one full 16-byte block at a time.
 * Returns without touching the state if length < 16; a trailing partial
 * block (length % 16 bytes) is not processed here.
 *
 * The loop is unrolled five times because every state update rotates the
 * state by one register; after five blocks the registers are back in their
 * natural order. The exit label .Lad_out_<n> is used after n (mod 5) blocks
 * and stores the registers in the matching rotated order.
 *
 * An aligned (movdqa) and an unaligned (movdqu) variant of the loop exist,
 * selected by the alignment of the data pointer.
 */
ENTRY(crypto_aegis128_aesni_ad)
	FRAME_BEGIN

	/*
	 * 'length' is an unsigned int, so the ABI only defines %esi (the low
	 * half of LEN); zero-extend it before LEN is used as a 64-bit value.
	 */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Lad_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* pick the unaligned loop unless SRC is 16-byte aligned: */
	mov SRC, %r8
	and $0xF, %r8
	jnz .Lad_u_loop

.align 8
.Lad_a_loop:
	movdqa 0x00(SRC), MSG
	aegis128_update
	pxor MSG, STATE4
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_1

	movdqa 0x10(SRC), MSG
	aegis128_update
	pxor MSG, STATE3
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_2

	movdqa 0x20(SRC), MSG
	aegis128_update
	pxor MSG, STATE2
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_3

	movdqa 0x30(SRC), MSG
	aegis128_update
	pxor MSG, STATE1
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_4

	movdqa 0x40(SRC), MSG
	aegis128_update
	pxor MSG, STATE0
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_0

	add $0x50, SRC
	jmp .Lad_a_loop

.align 8
.Lad_u_loop:
	movdqu 0x00(SRC), MSG
	aegis128_update
	pxor MSG, STATE4
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_1

	movdqu 0x10(SRC), MSG
	aegis128_update
	pxor MSG, STATE3
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_2

	movdqu 0x20(SRC), MSG
	aegis128_update
	pxor MSG, STATE2
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_3

	movdqu 0x30(SRC), MSG
	aegis128_update
	pxor MSG, STATE1
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_4

	movdqu 0x40(SRC), MSG
	aegis128_update
	pxor MSG, STATE0
	sub $0x10, LEN
	cmp $0x10, LEN
	jb .Lad_out_0

	add $0x50, SRC
	jmp .Lad_u_loop

	/* store the state: */
.Lad_out_0:
	/* multiple of five blocks: registers are in natural order */
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_1:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_2:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_3:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out_4:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Lad_out:
	/* nothing was processed: the state in memory is left as it was */
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_ad)

/*
 * encrypt_block: encrypt one full 16-byte block and absorb the plaintext.
 *
 *   a       - 'a' or 'u': selects movdqa / movdqu for the SRC load and the
 *             DST store
 *   s0-s4   - registers currently holding the first..fifth state word
 *             (s0 is not referenced by the macro body)
 *   i       - position in the unrolled loop; selects the SRC/DST offset
 *             (i * 0x10) and the exit label .Lenc_out_<i>
 *
 * The ciphertext is plaintext ^ s1 ^ s4 ^ (s2 & s3), computed from the state
 * before the update. The update rotates the state, so afterwards the register
 * passed as s4 holds the new first state word, into which the plaintext is
 * XORed. Leaves the loop through .Lenc_out_<i> once fewer than 16 bytes
 * remain.
 *
 * changed: MSG, T0, T1, LEN, STATE[0-4]
 */
.macro encrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	movdqa MSG, T0		/* keep the plaintext in MSG for the update */
	pxor \s1, T0
	pxor \s4, T0
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, T0
	movdq\a T0, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Lenc_out_\i
.endm

/*
 * void crypto_aegis128_aesni_enc(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Encrypt full 16-byte blocks from src to dst and absorb them into the state.
 * Returns without touching state or dst if length < 16; a trailing partial
 * block (length % 16 bytes) is not processed here.
 *
 * The loop is unrolled five times to follow the register rotation caused by
 * each state update. .Lenc_out_<i> is reached after block i of an unrolled
 * iteration and stores the registers in the matching rotated order.
 *
 * The aligned loop (movdqa) is used only when both src and dst are 16-byte
 * aligned; otherwise the unaligned loop (movdqu) is used.
 */
ENTRY(crypto_aegis128_aesni_enc)
	FRAME_BEGIN

	/*
	 * 'length' is an unsigned int, so the ABI only defines %esi (the low
	 * half of LEN); zero-extend it before LEN is used as a 64-bit value.
	 */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Lenc_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* any low address bit set in src or dst selects the unaligned loop: */
	mov  SRC,  %r8
	or   DST,  %r8
	and $0xF, %r8
	jnz .Lenc_u_loop

.align 8
.Lenc_a_loop:
	encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_a_loop

.align 8
.Lenc_u_loop:
	encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Lenc_u_loop

	/* store the state: */
.Lenc_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out_4:
	/* five blocks done: registers are back in natural order */
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Lenc_out:
	/* nothing was processed: the state in memory is left as it was */
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_enc)

/*
 * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Encrypt a final partial block: the source bytes are loaded into a
 * zero-padded block, encrypted, and only 'length' bytes of the result are
 * written to dst. The zero-padded plaintext block is then absorbed into the
 * state with one update.
 *
 * NOTE(review): presumably called with 0 < length < 16 - confirm against the
 * C glue code.
 */
ENTRY(crypto_aegis128_aesni_enc_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* encrypt message: */
	call __load_partial

	/* T0 = MSG ^ STATE1 ^ STATE4 ^ (STATE2 & STATE3); MSG stays intact */
	movdqa MSG, T0
	pxor STATE1, T0
	pxor STATE4, T0
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, T0

	call __store_partial

	aegis128_update
	pxor MSG, STATE4

	/* store the state (rotated by one register after a single update): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_enc_tail)

/*
 * decrypt_block: decrypt one full 16-byte block and absorb the plaintext.
 *
 *   a       - 'a' or 'u': selects movdqa / movdqu for the SRC load and the
 *             DST store
 *   s0-s4   - registers currently holding the first..fifth state word
 *             (s0 is not referenced by the macro body)
 *   i       - position in the unrolled loop; selects the SRC/DST offset
 *             (i * 0x10) and the exit label .Ldec_out_<i>
 *
 * The plaintext is ciphertext ^ s1 ^ s4 ^ (s2 & s3), computed in place in MSG
 * from the state before the update. The update rotates the state, so
 * afterwards the register passed as s4 holds the new first state word, into
 * which the recovered plaintext is XORed. Leaves the loop through
 * .Ldec_out_<i> once fewer than 16 bytes remain.
 *
 * changed: MSG, T0, T1, LEN, STATE[0-4]
 */
.macro decrypt_block a s0 s1 s2 s3 s4 i
	movdq\a (\i * 0x10)(SRC), MSG
	pxor \s1, MSG
	pxor \s4, MSG
	movdqa \s2, T1
	pand \s3, T1
	pxor T1, MSG
	movdq\a MSG, (\i * 0x10)(DST)

	aegis128_update
	pxor MSG, \s4

	sub $0x10, LEN
	cmp $0x10, LEN
	jl .Ldec_out_\i
.endm

/*
 * void crypto_aegis128_aesni_dec(void *state, unsigned int length,
 *                                const void *src, void *dst);
 *
 * Decrypt full 16-byte blocks from src to dst and absorb the recovered
 * plaintext into the state. Returns without touching state or dst if
 * length < 16; a trailing partial block (length % 16 bytes) is not processed
 * here.
 *
 * The loop is unrolled five times to follow the register rotation caused by
 * each state update. .Ldec_out_<i> is reached after block i of an unrolled
 * iteration and stores the registers in the matching rotated order.
 *
 * The aligned loop (movdqa) is used only when both src and dst are 16-byte
 * aligned; otherwise the unaligned loop (movdqu) is used.
 */
ENTRY(crypto_aegis128_aesni_dec)
	FRAME_BEGIN

	/*
	 * 'length' is an unsigned int, so the ABI only defines %esi (the low
	 * half of LEN); zero-extend it before LEN is used as a 64-bit value.
	 */
	mov %esi, %esi

	cmp $0x10, LEN
	jb .Ldec_out

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* any low address bit set in src or dst selects the unaligned loop: */
	mov  SRC, %r8
	or   DST, %r8
	and $0xF, %r8
	jnz .Ldec_u_loop

.align 8
.Ldec_a_loop:
	decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_a_loop

.align 8
.Ldec_u_loop:
	decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0
	decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1
	decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2
	decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3
	decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4

	add $0x50, SRC
	add $0x50, DST
	jmp .Ldec_u_loop

	/* store the state: */
.Ldec_out_0:
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_1:
	movdqu STATE3, 0x00(STATEP)
	movdqu STATE4, 0x10(STATEP)
	movdqu STATE0, 0x20(STATEP)
	movdqu STATE1, 0x30(STATEP)
	movdqu STATE2, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_2:
	movdqu STATE2, 0x00(STATEP)
	movdqu STATE3, 0x10(STATEP)
	movdqu STATE4, 0x20(STATEP)
	movdqu STATE0, 0x30(STATEP)
	movdqu STATE1, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_3:
	movdqu STATE1, 0x00(STATEP)
	movdqu STATE2, 0x10(STATEP)
	movdqu STATE3, 0x20(STATEP)
	movdqu STATE4, 0x30(STATEP)
	movdqu STATE0, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out_4:
	/* five blocks done: registers are back in natural order */
	movdqu STATE0, 0x00(STATEP)
	movdqu STATE1, 0x10(STATEP)
	movdqu STATE2, 0x20(STATEP)
	movdqu STATE3, 0x30(STATEP)
	movdqu STATE4, 0x40(STATEP)
	FRAME_END
	ret

.Ldec_out:
	/* nothing was processed: the state in memory is left as it was */
	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_dec)

/*
 * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length,
 *                                     const void *src, void *dst);
 *
 * Decrypt a final partial block: the ciphertext bytes are loaded into a
 * zero-padded block, decrypted, and only 'length' bytes of the result are
 * written to dst. Before the block is absorbed into the state, every byte at
 * index >= length is cleared, because those positions hold keystream rather
 * than plaintext after the XOR with the zero padding.
 *
 * NOTE(review): presumably called with 0 < length < 16 - confirm against the
 * C glue code.
 */
ENTRY(crypto_aegis128_aesni_dec_tail)
	FRAME_BEGIN

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* decrypt message: */
	call __load_partial

	/* MSG ^= STATE1 ^ STATE4 ^ (STATE2 & STATE3) */
	pxor STATE1, MSG
	pxor STATE4, MSG
	movdqa STATE2, T1
	pand STATE3, T1
	pxor T1, MSG

	movdqa MSG, T0
	call __store_partial

	/*
	 * mask with byte count: four byte-unpacks replicate the low byte of
	 * LEN into all 16 bytes of T0; the signed byte compare against the
	 * index table then yields 0xff exactly where index < LEN.
	 */
	movq LEN, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	punpcklbw T0, T0
	movdqa .Laegis128_counter(%rip), T1
	pcmpgtb T1, T0
	pand T0, MSG

	aegis128_update
	pxor MSG, STATE4

	/* store the state (rotated by one register after a single update): */
	movdqu STATE4, 0x00(STATEP)
	movdqu STATE0, 0x10(STATEP)
	movdqu STATE1, 0x20(STATEP)
	movdqu STATE2, 0x30(STATEP)
	movdqu STATE3, 0x40(STATEP)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_dec_tail)

/*
 * void crypto_aegis128_aesni_final(void *state, void *tag_xor,
 *                                  u64 assoclen, u64 cryptlen);
 *
 * Finalize: mix the two lengths into the state and fold the result into the
 * 16-byte buffer at tag_xor.
 *
 *   %rdi (STATEP) - state, read only (the updated state is not written back)
 *   %rsi          - tag_xor, read-modify-write: the buffer is XORed with the
 *                   XOR of all five final state words
 *   %rdx          - assoclen in bytes
 *   %rcx          - cryptlen in bytes
 */
ENTRY(crypto_aegis128_aesni_final)
	FRAME_BEGIN

	/* load the state: */
	movdqu 0x00(STATEP), STATE0
	movdqu 0x10(STATEP), STATE1
	movdqu 0x20(STATEP), STATE2
	movdqu 0x30(STATEP), STATE3
	movdqu 0x40(STATEP), STATE4

	/* prepare length block: low qword = assoclen, high qword = cryptlen */
	movq %rdx, MSG
	movq %rcx, T0
	pslldq $8, T0
	pxor T0, MSG
	psllq $3, MSG /* multiply by 8 (to get bit count) */

	pxor STATE3, MSG

	/*
	 * update state: seven updates with the same block; the destination
	 * register steps backwards to follow the state rotation.
	 */
	aegis128_update; pxor MSG, STATE4
	aegis128_update; pxor MSG, STATE3
	aegis128_update; pxor MSG, STATE2
	aegis128_update; pxor MSG, STATE1
	aegis128_update; pxor MSG, STATE0
	aegis128_update; pxor MSG, STATE4
	aegis128_update; pxor MSG, STATE3

	/* xor tag: */
	movdqu (%rsi), MSG

	/* XOR is order-independent, so the register rotation does not matter */
	pxor STATE0, MSG
	pxor STATE1, MSG
	pxor STATE2, MSG
	pxor STATE3, MSG
	pxor STATE4, MSG

	movdqu MSG, (%rsi)

	FRAME_END
	ret
ENDPROC(crypto_aegis128_aesni_final)
